Predicting New Construction in Philadelphia

MUSA 508 Final Project

Author

Laura Frances and Nissim Lebovits

Published

December 7, 2023

1 Summary

2 Introduction

Show the code
required_packages <- c("tidyverse", "sf", "acs", "tidycensus", "sfdep", "kableExtra", "conflicted",
                       "gganimate", "tmap", "gifski", "transformr", "ggpubr", "randomForest", "janitor",
                       'igraph', "plotly")
install_and_load_packages(required_packages)

source("utils/viz_utils.R")
  1. Predict development pressure: how do we define “a lot of development”?

  2. Define affordability burden: how do we define “affordability burden”? – % change year over year in population that is experience rate burden (will probably see extreme tipping points), growing population, % change in area incomes

  3. Identify problem zoning

  4. Calculate number of connected parcels

  5. Predict development pressure at the block level

  6. Identify not burdened areas

  7. Identify problem zoning

  8. Calcualte number of connected parcels

  9. Advocate for upzoning in parcels where there is local development pressure, no affordability burden, problem zoning, and high number of connected parcels

  • transit
  • zoning (OSM)
  • land use (OSM)
Show the code
urls <- c(
  roads = 'https://opendata.arcgis.com/datasets/261eeb49dfd44ccb8a4b6a0af830fdc8_0.geojson', # for broad and market
  council_dists = "https://opendata.arcgis.com/datasets/9298c2f3fa3241fbb176ff1e84d33360_0.geojson",
  building_permits = building_permits_path,
  permits_bg = final_dataset_path,
  zoning = "https://opendata.arcgis.com/datasets/0bdb0b5f13774c03abf8dc2f1aa01693_0.geojson",
  opa = "data/opa_properties.geojson"
)

suppressMessages({
  invisible(
    imap(urls, ~ assign(.y, phl_spat_read(.x), envir = .GlobalEnv))
  )
})

broad_and_market <- roads %>% filter(ST_NAME %in% c('BROAD',"MARKET") | SEG_ID %in% c(440370, 421347,421338,421337,422413,423051,440403,440402,440391,440380))

council_dists <- council_dists %>%
                    select(DISTRICT)

building_permits <- building_permits %>%
                      filter(permittype %in% c("RESIDENTIAL BUILDING", "BP_ADDITION", "BP_NEWCNST"))
Show the code
tm_shape(permits_bg %>% filter(year == 2023)) +
        tm_polygons(col = palette[4], alpha = 0.5, colorNA = "lightgrey") +
  tm_shape(broad_and_market) +
  tm_lines(col = "darkgrey") +
  tm_layout(frame = FALSE, title = "Philadelphia Block Groups") 

Show the code
# tm_out <- tm_shape(acs22) +
#         tm_polygons(col = "Ext_Rent_Burden", border.alpha = 0, palette = mono_5_orange, style = "fisher", colorNA = "lightgrey", title = "Extreme Rent Burden") +
#   tm_shape(broad_and_market) +
#   tm_lines(col = "darkgrey") +
#   tm_layout(frame = FALSE, title = "Extreme Rent Burden\nPhiladelphia, 2022") 
# 
# tmap_save(tm_out, "assets/extreme_rent_burden_2022.jpg", dpi = 500)

tm <- tm_shape(permits_bg %>% filter(!year %in% c(2012, 2023))) +
        tm_polygons(col = "permits_count", border.alpha = 0, palette = mono_5_green, style = "fisher", colorNA = "lightgrey") +
  tm_facets(along = "year") +
  tm_shape(broad_and_market) +
  tm_lines(col = "darkgrey") +
  tm_layout(frame = FALSE) 

suppressMessages(
tmap_animation(tm, "assets/permits_animation.gif", delay = 50)
)

Philadelphia Building Permits, 2013 - 2022

3 Methods

3.1 Data

Show the code
ggplot(building_permits %>% filter(year != 2023), aes(x = as.factor(year))) +
  geom_bar() +
  labs(title = "Permits per Year")

Show the code
ggplot(permits_bg %>% st_drop_geometry %>% filter(year != 2023), aes(x = permits_count)) +
  geom_histogram() +
  labs(title = "Permits per Block Group per Year",
       subtitle = "Log-Transformed") +
  scale_x_log10() +
  facet_wrap(~year)

Show the code
# 
# tm_shape(permits_bg) +
#   tm_polygons("spat_lag_permits", border.alpha = 0, palette = 'viridis', style = "fisher", colorNA = "lightgrey") +
#   tm_facets(along = "year") +
# tm_shape(broad_and_market) +
#   tm_lines(col = "lightgrey") +
#   tm_layout(frame = FALSE)
Show the code
perms_x_dist <- st_join(building_permits, council_dists)

perms_x_dist_sum <- perms_x_dist %>%
                  st_drop_geometry() %>%
                  group_by(DISTRICT, year) %>%
                  summarize(permits_count = n())

perms_x_dist_mean = perms_x_dist_sum %>%
                      group_by(year) %>%
                      summarize(permits_count = mean(permits_count)) %>%
                      mutate(DISTRICT = "Average")

perms_x_dist_sum <- bind_rows(perms_x_dist_sum, perms_x_dist_mean) %>%
                        mutate(color = ifelse(DISTRICT != "Average", 0, 1))

ggplotly(
ggplot(perms_x_dist_sum %>% filter(year > 2013, year < 2023), aes(x = year, y = permits_count, color = as.character(color), group = interaction(DISTRICT, color))) +
  geom_line(lwd = 0.5) +
  labs(title = "Permits per Year by Council District",
       y = "Total Permits") +
  # facet_wrap(~DISTRICT) +
  theme_minimal() +
  theme(axis.title.x = element_blank()) +
  scale_color_manual(values = c(palette[5], palette[2]))
)

3.1.1 Feature Engineering

Show the code
permits_bg_long <- permits_bg %>%
                    filter(year != 2023) %>%
                    st_drop_geometry() %>%
                    pivot_longer(
                      cols = c(starts_with("lag"), dist_to_2022),
                      names_to = "Variable",
                      values_to = "Value"
                    )


ggscatter(permits_bg_long, x = "permits_count", y = "Value", facet.by = "Variable",
   add = "reg.line",
   add.params = list(color = palette[3], fill = palette[5]),
   conf.int = TRUE
   ) + stat_cor(method = "pearson", p.accuracy = 0.001, r.accuracy = 0.01)

3.2 Models

4 Results

4.1 OLS Regression

To begin, we run a simple regression incorporating three engineered groups of features: space lag, time lag, and distance to 2022. We include this last variable because of a Philadelphia tax abatement policy that led to a significant increase in residential development in the years immediately before 2022. We will use this as a baseline model to compare to our more complex models.

Show the code
permits_train <- filter(permits_bg %>% select(-mapname), year < 2022)
permits_test <- filter(permits_bg %>% select(-mapname), year == 2022)
permits_validate <- filter(permits_bg %>% select(-mapname), year == 2023)

reg <- lm(permits_count ~ ., data = st_drop_geometry(permits_train))

predictions <- predict(reg, permits_test)
predictions <- cbind(permits_test, predictions)

predictions <- predictions %>%
                  mutate(abs_error = abs(permits_count - predictions),
                         pct_error = abs_error / permits_count)

ggplot(predictions, aes(x = permits_count, y = predictions)) +
  geom_point() +
  labs(title = "Predicted vs. Actual Permits",
       subtitle = "2022") +
  geom_smooth(method = "lm", se = FALSE)

Show the code
mae <- paste0("MAE: ", round(mean(predictions$abs_error, na.rm = TRUE), 2))

tm_shape(predictions) +
        tm_polygons(col = "abs_error", border.alpha = 0, palette = 'viridis', style = "fisher", colorNA = "lightgrey") +
  tm_shape(broad_and_market) +
  tm_lines(col = "lightgrey") +
  tm_layout(frame = FALSE) 

4.2 Poisson Model

Given that we are dealing with a skewed distribution of count data

Show the code
poisson_reg <- glm(permits_count ~ ., 
                   family = poisson(link = "log"),
                   data = st_drop_geometry(permits_train))

poisson_predictions <- predict(poisson_reg, permits_test, type = "response")
poisson_predictions <- cbind(permits_test, poisson_predictions)

poisson_predictions <- poisson_predictions %>%
                           mutate(abs_error = abs(permits_count - poisson_predictions),
                                  pct_error = abs_error / permits_count)

ggplot(poisson_predictions, aes(x = permits_count, y = poisson_predictions)) +
  geom_point() +
  labs(title = "Predicted vs. Actual Permits",
       subtitle = "2022") +
  geom_smooth(method = "lm", se = FALSE)

Show the code
poisson_mae <- paste0("MAE: ", round(mean(poisson_predictions$abs_error, na.rm = TRUE), 2))

tm_shape(poisson_predictions) +
        tm_polygons(col = "abs_error", border.alpha = 0, palette = 'viridis', style = "fisher", colorNA = "lightgrey") +
  tm_shape(broad_and_market) +
  tm_lines(col = "lightgrey") +
  tm_layout(frame = FALSE)

We find that our OLS model has an MAE of only MAE: 2.25–not bad for such a simple model! Still, it struggles most in the areas where we most need it to succeed, so we will try to introduce better variables and apply a more complex model to improve our predictions.

4.3 Random Forest Regression

Show the code
rf <- randomForest(permits_count ~ ., 
                   data = st_drop_geometry(permits_train),
                   importance = TRUE, 
                   na.action = na.omit)

rf_predictions <- predict(rf, permits_test)
rf_predictions <- cbind(permits_test, rf_predictions)
rf_predictions <- rf_predictions %>%
                  mutate(abs_error = abs(permits_count - rf_predictions),
                         pct_error = abs_error / (permits_count + 0.0001))

ggplot(rf_predictions, aes(x = permits_count, y = rf_predictions)) +
  geom_point() +
  labs(title = "Predicted vs. Actual Permits",
       subtitle = "2022") +
  geom_smooth(method = "lm", se = FALSE)

Show the code
rf_mae <- paste0("MAE: ", round(mean(rf_predictions$abs_error, na.rm = TRUE), 2))

tm_shape(rf_predictions) +
        tm_polygons(col = "abs_error", border.alpha = 0, palette = 'viridis', style = "fisher", colorNA = "lightgrey") +
  tm_shape(broad_and_market) +
  tm_lines(col = "lightgrey") +
  tm_layout(frame = FALSE) 

5 Discussion

5.1 Accuracy

Predominately, our model overpredicts, which is better than underpredicting, as it facilitates new development.

Show the code
nbins <- as.integer(sqrt(nrow(rf_predictions)))
vline <- mean(rf_predictions$abs_error, na.rm = TRUE)

ggplot(rf_predictions, aes(x = abs_error)) +
  geom_histogram(bins = nbins) +
  geom_vline(aes(xintercept = vline))

Show the code
hmm <- permits_bg %>%
  st_drop_geometry() %>%
  group_by(year) %>%
  summarize_all(.funs = list(~sum(is.na(.)))) # Check NA for all columns

5.2 Generalizabiltiy

Show the code
# tm_shape(acs22) +
#         tm_polygons(col = "Percent_Nonwhite", border.alpha = 0, palette = mono_5_orange, style = "fisher", colorNA = "lightgrey") +
#   tm_shape(broad_and_market) +
#   tm_lines(col = "lightgrey") +
#   tm_layout(frame = FALSE)
# 
# 
# tm_shape(acs22) +
#         tm_polygons(col = "Ext_Rent_Burden", border.alpha = 0, palette = mono_5_orange, style = "fisher", colorNA = "lightgrey") +
#   tm_shape(broad_and_market) +
#   tm_lines(col = "lightgrey") +
#   tm_layout(frame = FALSE)

rf_predictions <- rf_predictions %>%
                      mutate(race_comp = case_when(
                        percent_nonwhite >= .50 ~ "Majority Non-White",
                        TRUE ~ "Majority White"
                      ))

ggplot(rf_predictions, aes(y = abs_error, color = race_comp)) +
  geom_boxplot(fill = NA)

How does this generalize across neighborhoods?

How does this generalize across council districts?

5.3 Assessing Upzoning Needs

We can identify conflict between projected development and current zoning.

Look at zoning that is industrial or residential single family in areas that our model suggests are high development risk for 2023:

Show the code
filtered_zoning <- zoning %>%
                     filter(str_detect(CODE, "RS") | str_detect(CODE, "I"),
                            CODE != "I2")


tm_shape(filtered_zoning) +
        tm_polygons(col = "CODE", border.alpha = 0, colorNA = "lightgrey") +
  tm_shape(broad_and_market) +
  tm_lines(col = "lightgrey") +
  tm_layout(frame = FALSE)

We can extract development predictions at the block level to these parcels and then visualize them by highest need.

Show the code
filtered_zoning <- st_join(
  filtered_zoning,
  rf_predictions %>% select(rf_predictions)
)

tm_shape(filtered_zoning) +
        tm_polygons(col = "rf_predictions", border.alpha = 0, colorNA = "lightgrey", palette = "viridis", style = "fisher") +
  tm_shape(broad_and_market) +
  tm_lines(col = "lightgrey") +
  tm_layout(frame = FALSE)

Show the code
tmap_mode('view')

filtered_zoning %>%
  filter(rf_predictions > 10) %>%
tm_shape() +
        tm_polygons(col = "CODE", border.alpha = 0, colorNA = "lightgrey",
                    popup.vars = c('rf_predictions', 'CODE')) +
  tm_shape(broad_and_market) +
  tm_lines(col = "lightgrey") +
  tm_layout(frame = FALSE)

Furthermore, we can identify properties with high potential for assemblage, which suggests the ability to accomodate high-density, multi-unit housing.

Show the code
nbs <- filtered_zoning %>% 
  mutate(nb = st_contiguity(geometry))

# Create edge list while handling cases with no neighbors
edge_list <- tibble::tibble(id = 1:length(nbs$nb), nbs = nbs$nb) %>% 
  tidyr::unnest(nbs) %>% 
  filter(nbs != 0)

# Create a graph with a node for each row in filtered_zoning
g <- make_empty_graph(n = nrow(filtered_zoning))
V(g)$name <- as.character(1:nrow(filtered_zoning))

# Add edges if they exist
if (nrow(edge_list) > 0) {
  edges <- as.matrix(edge_list)
  g <- add_edges(g, c(t(edges)))
}

# Calculate the number of contiguous neighbors, handling nodes without neighbors
n_contiguous <- sapply(V(g)$name, function(node) {
  if (node %in% edges) {
    length(neighborhood(g, order = 1, nodes = as.numeric(node))[[1]])
  } else {
    1  # Nodes without neighbors count as 1 (themselves)
  }
})

filtered_zoning <- filtered_zoning %>%
                    mutate(n_contig = n_contiguous)

filtered_zoning %>%
  st_drop_geometry() %>%
  select(rf_predictions,
         n_contig,
         OBJECTID,
         CODE) %>%
  filter(rf_predictions > 10,
         n_contig > 2) %>%
  arrange(desc(rf_predictions)) %>%
  kablerize(caption = "Poorly-Zoned Properties with High Development Risk")
Poorly-Zoned Properties with High Development Risk
rf_predictions n_contig OBJECTID CODE
7575 38.37310 3 16717 RSA5
4993 34.18023 3 10410 ICMX
4994 34.18023 3 10411 RSA5
4995 34.18023 3 10412 ICMX
5284 34.18023 3 11160 RSA5
1784 31.03953 3 3128 IRMX
3669 31.03953 3 6901 ICMX
3964 25.55263 3 7646 ICMX
12407 25.55263 4 25776 RSA5
875 21.73580 3 1615 ICMX
1563 21.73580 3 2736 IRMX
1602 21.73580 3 2804 IRMX
3448 21.73580 3 6405 RSA5
4700 21.73580 3 9661 RSA5
9239 21.73580 4 20073 ICMX
4492 20.66737 3 9093 RSA5
13662 19.53400 3 27869 IRMX
5125 19.05720 3 10759 IRMX
3987 15.37260 3 7704 IRMX
7786 14.19870 3 17168 ICMX
7893 13.97167 3 17408 RSA5
6008 12.64667 3 12931 ICMX
6444 12.64667 3 13980 RSA3
6589 12.64667 3 14372 RSA5
6599 12.64667 3 14401 RSA5
6696 12.30283 3 14648 ICMX
7335 12.30283 3 16179 RSA5
9984 12.30283 3 21527 ICMX
5818 11.93540 3 12473 RSA5
8315 11.93540 3 18254 RSA3
12923 11.93540 3 26627 RSA5
4177 11.82093 3 8265 IRMX
5145 11.82093 4 10795 IRMX
4544 11.56500 5 9243 IRMX
6058 11.56500 6 13057 ICMX
3031 11.14353 3 5506 RSA5
4590 11.14353 3 9370 ICMX
2154 10.89000 4 3744 IRMX
6248 10.01617 4 13532 ICMX
Show the code
tmap_mode('view')

filtered_zoning %>%
  select(rf_predictions,
         n_contig,
         OBJECTID,
         CODE) %>%
  filter(rf_predictions > 10,
         n_contig > 2) %>%
tm_shape() +
        tm_polygons(col = "rf_predictions", border.alpha = 0, colorNA = "lightgrey", palette = "viridis", style = "fisher",
                    popup.vars = c('rf_predictions', 'n_contig', 'CODE'), alpha = 0.5) +
  tm_shape(broad_and_market) +
  tm_lines(col = "lightgrey") +
  tm_layout(frame = FALSE)

5.4 2024 Predictions

Just for shits and giggles, throw in 2024 predictions. (Can use data from 2023.)

Show the code
# need to create a new dataframe with year = 2024, spatial and temporal lag based on 2023, and most recent ACS data (2022)

6 Conclusion

7 Appendices